using Pkg Pkg.activate(".") using DataFrames, CSV, DataFramesMeta, StatsPlots, Downloads, ProgressMeter, Weave, Loess, Arrow, Transducers, Memoization, ReverseDiff, Zygote, Turing, LinearAlgebra, ColorSchemes, Serialization
In this project, we will download some files from the CA dept of Education which give averaged results for test scores in all schools in CA across various student groups within the schoool.
The data is available at: https://caaspp-elpac.cde.ca.gov/caaspp/ResearchFileListSB?ps=true&lstTestYear=2021&lstTestType=B&lstCounty=00&lstDistrict=00000#dl
We are going to get the Los Angeles county files which avoids unnecessary data.
Unfortunately some years have different "versions" and there is no way on the site to get a list of files, so we need to just by hand click through and find out the file names, for example:
https://caaspp-elpac.cde.ca.gov/caaspp/researchfiles/sbca2015all19csv_v3.zip for 2014-2015 LA County
https://caaspp-elpac.cde.ca.gov/caaspp/researchfiles/sbca2021all19csv_v2.zip for 2020-2021 LA County data...
there is no 2019-2020 data as COVID prevented the testing from happening.
versions = [2015 => 3, 2016 => 3, 2017 => 2, 2018 => 3, 2019 => 4 , 2021 => 2] files = ["sb_ca$(vv[1])_all_19_csv_v$(vv[2]).zip" for vv in versions]
6-element Vector{String}:
"sb_ca2015_all_19_csv_v3.zip"
"sb_ca2016_all_19_csv_v3.zip"
"sb_ca2017_all_19_csv_v2.zip"
"sb_ca2018_all_19_csv_v3.zip"
"sb_ca2019_all_19_csv_v4.zip"
"sb_ca2021_all_19_csv_v2.zip"
We also need the entity file that describes the codes for each school/district etc and the codes for the tests and student group descriptions we'll assume for our analysis that none of the entities of interest changed during the time period and just get the recent ones.
push!(files,"sb_ca2021entities_csv.zip") push!(files,"Tests.zip") push!(files,"StudentGroups.zip")
9-element Vector{String}:
"sb_ca2015_all_19_csv_v3.zip"
"sb_ca2016_all_19_csv_v3.zip"
"sb_ca2017_all_19_csv_v2.zip"
"sb_ca2018_all_19_csv_v3.zip"
"sb_ca2019_all_19_csv_v4.zip"
"sb_ca2021_all_19_csv_v2.zip"
"sb_ca2021entities_csv.zip"
"Tests.zip"
"StudentGroups.zip"
We'll change directory into the data dir, download each of the files, unzip them, and then cd out of the directory.
We use try/finally to ensure that we wind up in the same directory even if there's an error, then read the directory to see what files we have...
try cd("data") @showprogress 3 for f in files if ! ispath(f) # if we don't have the file already Downloads.download("https://caaspp-elpac.cde.ca.gov/caaspp/researchfiles/$(f)",f) end run(`unzip -u $(f)`) end finally cd("..") end readdir("data")
Archive: sb_ca2015_all_19_csv_v3.zip
Archive: sb_ca2016_all_19_csv_v3.zip
Archive: sb_ca2017_all_19_csv_v2.zip
Archive: sb_ca2018_all_19_csv_v3.zip
Archive: sb_ca2019_all_19_csv_v4.zip
Archive: sb_ca2021_all_19_csv_v2.zip
Archive: sb_ca2021entities_csv.zip
Archive: Tests.zip
Archive: StudentGroups.zip
19-element Vector{String}:
"StudentGroups.txt"
"StudentGroups.zip"
"Tests.txt"
"Tests.zip"
"sb_ca2015_all_19_csv_v3.txt"
"sb_ca2015_all_19_csv_v3.zip"
"sb_ca2015_all_ascii_v3.zip"
"sb_ca2016_all_19_csv_v3.txt"
"sb_ca2016_all_19_csv_v3.zip"
"sb_ca2017_all_19_csv_v2.txt"
"sb_ca2017_all_19_csv_v2.zip"
"sb_ca2018_all_19_csv_v3.txt"
"sb_ca2018_all_19_csv_v3.zip"
"sb_ca2019_all_19_csv_v4.txt"
"sb_ca2019_all_19_csv_v4.zip"
"sb_ca2021_all_19_csv_v2.txt"
"sb_ca2021_all_19_csv_v2.zip"
"sb_ca2021entities_csv.txt"
"sb_ca2021entities_csv.zip"
Next, we need to read the data in, and try to subset down to the schools of interest. Unfortunately, the files don't all have the same columns, nor do they have the same delimeters! What do we do about that? DataFrames allows us to append!(df,df2) and handle the case where they don't have exactly the same columns by various methods by specifying the "cols" option (see docs) cols=:union causes the resulting dataset to have all the columns from both files, but with missing values where needed
intify(s::Int64) = s function intify(s) x = tryparse(Int64,s) if isnothing(x) missing else x end end floatormiss(x::Float64) = x function floatormiss(x) if ismissing(x) return missing end if isa(x,String) let v = tryparse(Float64,x); if isnothing(v) missing else v end end end end testscores = DataFrame() entities = DataFrame() tests = DataFrame() students = DataFrame() const cachedir = "/var/cache/userdata/dlakelan/juliacache" if ! ispath(joinpath(cachedir,"tests.arrow")) # there are no cache files, build them. for f in filter(endswith(".txt"),readdir("data")) @show f if occursin("entities",f) global entities = CSV.read(joinpath("data",f),DataFrame; normalizenames=true, delim="^") elseif occursin("Student",f) global students = CSV.read(joinpath("data",f),DataFrame; normalizenames=true,delim="^") elseif occursin("Tests",f) global tests = CSV.read(joinpath("data",f),DataFrame; normalizenames=true,delim="^") else df = CSV.read(joinpath("data",f),DataFrame; normalizenames=true) # uses a standard "," delimiter append!(testscores,df; cols=:union) # we will collect all the various columns with missing data where they don't exist end end Arrow.write(joinpath(cachedir,"tests.arrow"),tests) Arrow.write(joinpath(cachedir,"students.arrow"),students) Arrow.write(joinpath(cachedir,"entities.arrow"),entities) Arrow.write(joinpath(cachedir,"testscores.arrow"),testscores) end # now we know we have cache files. read them (note doing it this way means we're always analyzing the data from the cache) tests = DataFrame(Arrow.Table(joinpath(cachedir,"tests.arrow"))) students = DataFrame(Arrow.Table(joinpath(cachedir,"students.arrow"))) entities = DataFrame(Arrow.Table(joinpath(cachedir,"entities.arrow"))) # the dataframe for test scores is huge, 3.8 million rows and a lot of columns, we only care about these districts. # we run the rows of the Arrow.Table through a Filter transducer that filters on District_Code and collect the ones we care about, saving gigabytes # of RAM ourentities = @subset(entities,in.(:District_Name, Ref(["Pasadena Unified","Glendale Unified", "Alhambra Unified"]))) #@show ourentities let ourdistcodes = unique(ourentities.District_Code); global testscores = Tables.rows(Arrow.Table(joinpath(cachedir,"testscores.arrow"))) |> Filter(x -> !ismissing(x.District_Code) && x.District_Code in ourdistcodes && x.School_Code .!= 0) |> DataFrame @rtransform!(testscores, :Students_Tested = if ismissing(:Students_Tested) intify(:Total_Tested_at_Subgroup_Level) else intify(:Students_Tested) end, :CAASPP_Reported_Enrollment = intify(:CAASPP_Reported_Enrollment), :Total_Tested_At_Entity_Level = intify(:Total_Tested_At_Entity_Level), :Mean_Scale_Score = floatormiss(:Mean_Scale_Score)) end "Done Loading Data..."
"Done Loading Data..."
pusdentities = @subset(entities,.! ismissing.(:District_Name) .&& :District_Name .== "Pasadena Unified" .&& .! ismissing.(:School_Name)) alhambraentities = @subset(entities,.! ismissing.(:District_Name) .&& :District_Name .== "Alhambra Unified" .&& .! ismissing.(:School_Name)) glendaleentities = @subset(entities,.! ismissing.(:District_Name) .&& :District_Name .== "Glendale Unified" .&& .! ismissing.(:School_Name)) pusdtests = leftjoin(@select(pusdentities,:School_Code,:School_Name,:District_Name),testscores,on = :School_Code, matchmissing=:notequal) alhambratests = leftjoin(@select(alhambraentities,:School_Code,:School_Name,:District_Name),testscores,on = :School_Code, matchmissing=:notequal) glendaletests = leftjoin(@select(glendaleentities,:School_Code,:School_Name,:District_Name),testscores,on = :School_Code, matchmissing=:notequal) "Done subsetting..." # so we don't output the above table
"Done subsetting..."
We want points for each test... TestID = 1 means english, 2 means Math (for "smarter balanced" assessment). Let's group the tests by cohort, meaning TestYear - (Grade - 3) basically the year they were in 3rd grade which is the first year you take the test.
function plotschools(entities,testscores) for (testid,testname) in Iterators.zip([1,2],["English","Math"]) let pl = [] for sch in eachrow(entities) ourdf = @subset(testscores,:School_Code .== sch.School_Code .&& :Subgroup_ID .== 1 .&& :Grade .< 13 .&& :Test_Id .== testid .&& .!ismissing.(:Mean_Scale_Score)) # @show ourdf if nrow(ourdf) > 0 p = @df ourdf plot(:Grade,:Mean_Scale_Score; xlim=(3,12),ylim = (2250,2750),group=:Test_Year .- (:Grade .- 3),legend=false,size=(250,250), title="$(testname): $(sch.School_Name)",linewidth=3) p = @df ourdf scatter!(:Grade,:Mean_Scale_Score; xlim=(3,12),ylim = (2250,2750),group=:Test_Year .- (:Grade .- 3),legend=false,size=(250,250), title="$(testname): $(sch.School_Name)",markerstrokewidth=0) push!(pl,p) end end display(plot(pl...; size=(1500,1500))) end end end plotschools(pusdentities,@subset(pusdtests,:Subgroup_ID .== 1)) plotschools(alhambraentities,@subset(alhambratests,:Subgroup_ID .== 1))
Clearly there are some differences between schools. Since we are looking at the average across the school some of this will be because the demographic and socioeconomic mix of the students is different. For example the middle schoolers at the relative wealthy community of Sierra Madre Middle School are testing about the same as the high school 11th graders at Blair and Marshall Fundamental.
Also Alhambra schools appear to have higher achievement in general.
A considerable difference in overall average test scores can be attributed to a different mix of students. So let's break down the schools by demographic groups. The DemographicID in the students table describes the StudentGroups. We can break this down by:
"Economically Disadvantaged" vs "Not economically disadvantaged"
Race and ethnicity
Parent Education Level
For now, let's focus on math scores, and we'll iterate over every school, and output a graph that shows the average curve for each parent education level:
function plotedlevel(schools,scores,dist) edlevnames = Dict(90 => "No HSD", 91 => "HSD", 92 => "Some College", 93 => "College Grad", 94=>"Grad School") for sch in eachrow(schools) ourdf = @subset(scores,:Test_Id .== 2 .&& :School_Code .== sch.School_Code .&& in.(:Subgroup_ID ,Ref(90:94))) p = @df ourdf scatter(:Grade,:Mean_Scale_Score; xlim=(2.5,12), ylim=(2300,2800),title="$(dist)\nMath $(sch.School_Name)\nBy Parent Ed",label=false,markersize=3,size=(500,500)) for edlev in 90:94 subs = @subset(ourdf,:Subgroup_ID .== edlev .&& .!ismissing.(:Grade) .&& .! ismissing.(:Mean_Scale_Score)) if nrow(subs) < 4 continue end #println("There are $(nrow(subs)) observations for $(sch.School_Name)") grades = collect(minimum(subs.Grade):maximum(subs.Grade)) if length(grades) > 2 l = loess(Float64.(subs.Grade),Float64.(subs.Mean_Scale_Score)) p = plot!(grades,map(x -> Loess.predict(l,x), Float64.(grades)); label=edlevnames[edlev], linewidth=3) end end display(p) end end plotedlevel(pusdentities,pusdtests,"PUSD") plotedlevel(alhambraentities,alhambratests,"Alhambra") plotedlevel(glendaleentities,glendaletests,"Glendale")
function plotdistrictedlev(tests,name) edlevnames = Dict(90 => "No HSD", 91 => "HSD", 92 => "Some College", 93 => "College Grad", 94=>"Grad School") p = @df tests scatter(:Grade,:Mean_Scale_Score; xlim=(2.5,12), ylim=(2300,2800), title="$(name)\nMath Scores") for edlev in keys(edlevnames) subs = @subset(tests, :Subgroup_ID .== edlev .&& .! ismissing.(:Grade) .&& .! ismissing.(:Mean_Scale_Score)) grades = collect(3:11) l = loess(Float64.(subs.Grade), Float64.(subs.Mean_Scale_Score)) p = plot!(grades,map(x -> Loess.predict(l,x), Float64.(grades)); label = edlevnames[edlev],linewidth=3) end display(p) p end p1 = plotdistrictedlev(@subset(pusdtests,:Test_Id .== 2),"PUSD") p2 = plotdistrictedlev(@subset(alhambratests,:Test_Id .== 2),"Alhambra") p3 = plotdistrictedlev(@subset(glendaletests,:Test_Id .== 2),"Glendale") plot(p1,p2,p3; layout=(1,3),legend=:topleft,size=(2000,500))
One reason to create a mathematical model is that it can simplify your understanding of a problem. For example a drag coefficient is a kind of summary of the entire pressure field on the surface of an object... integrated together it results in a certain amount of drag.
In our case we will assume that variation from year to year is low, so that we can pool the years together, and get a summary of how quickly students are learning math, by fitting a line through different groups of grades: (3,4,5) (6,7,8) and the level at grade 11 we can summarize the general trend in each school as just 5 numbers (levels and slopes of each group), rather than the details of potentially hundreds of measurements.
We will assume the level and slope of these three lines depends on parents education level only, since we don't have individual student data with covariates where we could fit a more complicated model.
@model function schoolmod(school,parented,grade,meanscore,nschools,nedlev) c4avg ~ Normal(2450.0,200.0) c7avg ~ Normal(2500.0,200.0) c11avg ~ Normal(2550.0,200.0) m4avg ~ Normal(0.0,100.0) m7avg ~ Normal(0.0,100.0) msd ~ Gamma(10.0,20.0/9.0) csd ~ Gamma(10.0,20.0/9.0) c4 ~ MvNormal(repeat([0.0],nschools),csd^2*I(nschools)) c7 ~ MvNormal(repeat([0.0],nschools),csd^2*I(nschools)) c11 ~ MvNormal(repeat([0.0],nschools),csd^2*I(nschools)) m4 ~ MvNormal(repeat([0.0],nschools),msd^2*I(nschools)) m7 ~ MvNormal(repeat([0.0],nschools),msd^2*I(nschools)) edlevmult ~ filldist(Gamma(20.0,1.0/19),nschools) cedlev4 ~ MvNormal(repeat([0.0],nedlev),Diagonal([50.0,50.0,50.0,50.0,50.0].^2)) cedlev7 ~ MvNormal(repeat([0.0],nedlev),Diagonal([50.0,50.0,50.0,50.0,50.0].^2)) cedlev11 ~ MvNormal(repeat([0.0],nedlev),Diagonal([50.0,50.0,50.0,50.0,50.0].^2)) medlev4 ~ MvNormal(repeat([0.0],nedlev),Diagonal([20.0,20.0,20.0,20.0,20.0].^2)) medlev7 ~ MvNormal(repeat([0.0],nedlev),Diagonal([20.0,20.0,20.0,20.0,20.0].^2)) s ~ Gamma(10.0,50/10.0) preds = zeros(eltype(s),length(meanscore)) for (i,(s,ed,g,sc)) in Iterators.enumerate(Iterators.zip(school,parented,grade,meanscore)) if ed == 1 # base case med4 = med7 = ced4 = ced7 = ced11 = zero(eltype(cedlev4)) else ced4 = cedlev4[ed] * edlevmult[s] ced7 = cedlev7[ed] * edlevmult[s] ced11 = cedlev11[ed] * edlevmult[s] med4 = medlev4[ed] * edlevmult[s] med7 = medlev7[ed] * edlevmult[s] end if g in (3,4,5) preds[i] = (c4avg + c4[s] + ced4) + (m4avg + m4[s] + med4) * (g-4) elseif g in (6,7,8) preds[i] = (c7avg + c7[s] + ced7) + (m7avg + m7[s] + med7) * (g-7) elseif g == 11 preds[i] = (c11avg + c11[s] + ced11) end end meanscore ~ MvNormal(preds,s^2 * I(length(preds))) end edlevtests = @chain begin @subset(testscores,.!ismissing.(:Mean_Scale_Score) .&& :Test_Id .== 2 .&& :School_Code .!= 0 .&& in.(:Subgroup_ID,Ref((90,91,92,93,94)))) @transform(:Edlev = :Subgroup_ID .- 89) @transform(:Score = Float64.(:Mean_Scale_Score)) end schoolids = map(Pair,edlevtests.District_Code,edlevtests.School_Code) uniqueids = unique(schoolids) schoolnames = Dict() schooldists = Dict() schoolgrades = Dict() for (i,k) in Iterators.enumerate(uniqueids) schoolnames[i] = "Unknown" schoolnames[k] = "Unknown" for g in 1:11 schoolgrades[k => g] = false end schooldists[k] = "Unknown" schooldists[i] = "Unknown" end for r in eachrow(entities) schoolnames[r.District_Code => r.School_Code] = r.School_Name schooldists[r.District_Code => r.School_Code] = r.District_Name for grade in 1:11 schoolgrades[(r.District_Code => r.School_Code) => grade ] = false end end for i in 1:length(uniqueids) schooldists[i] = schooldists[uniqueids[i]] end for r in eachrow(testscores) schoolgrades[(r.District_Code => r.School_Code) => r.Grade ] = true end schoolDict = Dict() merge!(schoolDict,Dict(map(Pair,uniqueids,1:length(uniqueids)))) merge!(schoolDict,Dict(map(Pair,1:length(uniqueids),uniqueids))) for i in 1:length(uniqueids) schoolnames[i] = haskey(schoolnames,schoolDict[i]) ? schoolnames[schoolDict[i]] : "Unknown"; end smod = schoolmod(map(x -> schoolDict[x],schoolids),edlevtests.Edlev,edlevtests.Grade,edlevtests.Score,length(uniqueids),5) Turing.setadbackend(:reversediff) Turing.setrdcache(true) if ispath("cache/mcmcchain.dat") ch = deserialize("cache/mcmcchain.dat") else ch = sample(smod,NUTS(900,.8),MCMCThreads(),500,2) end "Done sampling model..."
"Done sampling model..."
Now that we have fit our model, we have hundreds of samples of hundreds of parameters. Let's take a look at their distribution
for sch in 1:length(uniqueids) dist = schooldists[sch] p = plot(; linewidth=3, title = "$dist : $(schoolnames[sch])", xlim=(2.5,12),ylim=(2300,2700),legend=false) c4avg = mean(ch[:,Symbol("c4avg"),:]) m4avg = mean(ch[:,Symbol("m4avg"),:]) c7avg = mean(ch[:,Symbol("c7avg"),:]) m7avg = mean(ch[:,Symbol("m7avg"),:]) c11avg = mean(ch[:,Symbol("c11avg"),:]) edlevcolors = colorschemes[:tol_light] # a RG colorblind friendly pallette has4 = schoolgrades[schoolDict[sch] => 4] has7 = schoolgrades[schoolDict[sch] => 7] has11 = schoolgrades[schoolDict[sch] => 11] for ed in 1:5 c4 = mean(ch[:,Symbol("c4[$sch]"),:] + ch[:,Symbol("cedlev4[$ed]"),:] .* ch[:,Symbol("edlevmult[$sch]"),:]) + c4avg m4 = mean(ch[:,Symbol("m4[$sch]"),:] + ch[:,Symbol("medlev4[$ed]"),:] .* ch[:,Symbol("edlevmult[$sch]"),:]) + m4avg c7 = mean(ch[:,Symbol("c7[$sch]"),:] + ch[:,Symbol("cedlev7[$ed]"),:] .* ch[:,Symbol("edlevmult[$sch]"),:]) + c7avg m7 = mean(ch[:,Symbol("m7[$sch]"),:] + ch[:,Symbol("medlev7[$ed]"),:] .* ch[:,Symbol("edlevmult[$sch]"),:]) + m7avg c11 = mean(ch[:,Symbol("c11[$sch]"),:] + ch[:,Symbol("cedlev11[$ed]"),:] .* ch[:,Symbol("edlevmult[$sch]"),:]) + c11avg al = has4 ? 1.0 : .1 p = plot!([3,5], [c4-m4,c4+m4]; linewidth=6, color=edlevcolors[ed], xlim=(2.5,12),ylim=(2300,2800),legend=false, alpha=al) al = has7 ? 1.0 : .1 p = plot!([6,8], [c7-m7,c7+m7]; linewidth=6, color=edlevcolors[ed],alpha=al) al = has11 ? 1.0 : .1 p = scatter!([11], [c11]; markersize=6,alpha=al,color=edlevcolors[ed]) for i in 1:10 edmult = ch[i,Symbol("edlevmult[$sch]"),1] j = rand(1:length(ch)) c = ch[j,Symbol("c4[$sch]"),1] + ch[j,Symbol("cedlev4[$ed]"),1] * edmult + c4avg m = ch[j,Symbol("m4[$sch]"),1] + ch[j,Symbol("medlev4[$ed]"),1] * edmult + m4avg p = plot!([3,5],[c - m, c + m],label=false,alpha=.2,color=edlevcolors[ed]) c = ch[j,Symbol("c7[$sch]"),1] + ch[j,Symbol("cedlev7[$ed]"),1] * edmult + c7avg m = ch[j,Symbol("m7[$sch]"),1] + ch[j,Symbol("medlev7[$ed]"),1] * edmult + m7avg p = plot!([6,8],[c - m, c + m],label=false,alpha=.2,color=edlevcolors[ed]) c11 = ch[j,Symbol("c11[$sch]"),1] + ch[j,Symbol("cedlev11[$ed]"),1] * edmult + c11avg p = scatter!([randn()*.2 + 11],[c11],label=false,alpha=.2,color=edlevcolors[ed]) end end display(p) end
let pp = [] for (i,sch) in Iterators.enumerate(uniqueids) p1 = density(ch[:,Symbol("c4[$i]"),:]; title = "$(schoolnames[i]) c4",fill=true,alpha=.5,xlim=(-100,100)) p2 = density(ch[:,Symbol("c7[$i]"),:]; title = "$(schoolnames[i]) c7",fill=true,alpha=.5,xlim=(-100,100)) p3 = density(ch[:,Symbol("c11[$i]"),:]; title = "$(schoolnames[i]) c11",fill=true,alpha=.5,xlim=(-100,100)) p4 = density(ch[:,Symbol("m4[$i]"),:]; title = "$(schoolnames[i]) m4",fill=true,alpha=.5,xlim=(-50,50)) p5 = density(ch[:,Symbol("m7[$i]"),:]; title = "$(schoolnames[i]) m7",fill=true,alpha=.5,xlim=(-50,50)) p6 = density(ch[:,Symbol("edlevmult[$i]"),:]; title = "$(schoolnames[i]) EdLevMult",fill=true,alpha=.5,xlim=(0,2)) push!(pp,plot(p1,p2,p3,p4,p5,p6;layout =(1,6),size=(1000,1000),titlefontsize=8)) end for ppp in Iterators.partition(pp,4) display(plot(ppp...; size=(1200,1200),layout=(4,1))) end end
for (i,sch) in Iterators.enumerate(uniqueids) schdat = @subset(testscores,:Grade .< 12 .&& :District_Code .== sch[1] .&& :School_Code .== sch[2] .&& isa.(:Students_Tested,Int64) .&& :Test_Id .== 2 .&& in.(:Subgroup_ID, Ref((90,91,92,93,94,95)))) schdat = @by(schdat,:Subgroup_ID, :ntested = sum(:Students_Tested)) ntot = sum(schdat.ntested) p = @df schdat bar(:Subgroup_ID , :ntested ./ ntot ; legend=false,title="$(schoolnames[sch])\nFraction at Parent Education Level") display(p) end